Package org.terrier.structures.indexing.singlepass

Source Code of org.terrier.structures.indexing.singlepass.RunsMerger$PostingComparator

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is RunsMerger.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Roi Blanco (rblanc{at}@udc.es)
*   Craig Macdonald (craigm{at}dcs.gla.ac.uk)
*/
package org.terrier.structures.indexing.singlepass;


import java.io.IOException;
import java.io.Serializable;
import java.util.Comparator;
import java.util.PriorityQueue;
import java.util.Queue;

import org.terrier.compression.BitOut;
import org.terrier.compression.BitOutputStream;
import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.BitFilePosition;
import org.terrier.structures.FilePosition;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;

/**
* Merges a set of N runs using a priority queue. Each element of the queue is a {@link org.terrier.structures.indexing.singlepass.RunIterator}
* each one pointing at a different run in disk. Each run is sorted, so we only need to compare the heads of the
* element in the queue in each merging step.
* As the runs are being merged, they are written (to disk) using a {@link org.terrier.compression.BitOut}.
* @author Roi Blanco and Craig Macdonald
* @since 2.0
  */
public class RunsMerger {
 
  /**
   * Implements a comparator for RunIterators (so it can be used by the queue).
   * It decides the next reader by the lexicographical order of the terms in the top elements of the readers.
   * @author Roi Blanco and Craig Macdonald
   */
  public static class PostingComparator implements Comparator<RunIterator>, Serializable{
    /** generated by eclipse */
    private static final long serialVersionUID = 8674662139960016073L;
    @Override
    public int compare (RunIterator a, RunIterator b)
    {
      int tmp = a.current().getTerm().compareTo(b.current().getTerm());
      return tmp != 0 ? tmp : a.getRunNo() - b.getRunNo();
    }
  }
 
  /**
   * Heap for the postings coming from different runs.
   * It uses an alphabetical order using the terms
   */  
  protected Queue<RunIterator> queue;   
  /** BitOut used to write the merged postings to disk*/
  protected BitOut bos; 
  /** RunReader reference for instantiation*/
  //protected RunReader run;
  /** Last term written to disk (useful for terms appearing in mutiple runs */
  protected String lastTermWritten = "";
 
  protected LexiconEntry termStatistics = null;
 
  /** Frequency in the run of the last term merged */
  protected int lastFreq = 0;
  /**Last document written in the stream*/
  protected int lastDocument = 0;
  /** Last document's frequency*/
  protected int lastDocFreq = 0;
  /** RunReader reference for merging */
  protected RunIterator myRun;
  /** Number of terms written */
  protected int currentTerm = 0;
  /** Number of pointers written */
  protected int numberOfPointers = 0;

  protected BitFilePosition startOffset = new FilePosition(0l,(byte)0);

 
  protected RunIteratorFactory runsSource;
  /**
   * constructor
   * @param _runsSource
   */
  public RunsMerger(RunIteratorFactory _runsSource)
  {
    runsSource = _runsSource;
  }
 
  /**
   * @return the last frequency written.
   */
  public int getLastFreq(){
    return lastFreq;
  }
 
  /**
   * @return the last document frequency written.
   */
  public int getLastDocFreq(){
    return lastDocFreq;
 
 
  /**
   * @return the number of terms written.
   */
  public int getNumberOfTerms(){
    return currentTerm;
  }
 
  /**
   * @return the number of pointers written.
   */
  public int getNumberOfPointers(){
    return numberOfPointers;
  }
 
  /** Indicates whether the merging is done or not
   * @return true if there are no more elements to merge
   */
  public boolean isDone(){
    return queue.isEmpty();
  }
 
  /**
   * @return the byte offset in the BitOut (used for lexicon writting)
   */
  public long getByteOffset(){
    return bos.getByteOffset();
    //return bos.getBitOffset() == 0? bos.getByteOffset() - 1: bos.getByteOffset();
  }
 
  /**
   * @return the bit offset in the BitOut (used for lexicon writting)
   */
  public byte getBitOffset(){
    return bos.getBitOffset();
    //return bos.getBitOffset() == 0 ? (byte)7 : bos.getBitOffset() - (byte)1;
  }
 
  /**
   * @return the String with the last term written to disk.
   */
  public String getLastTermWritten() {
    return lastTermWritten;
  }
 
  /**
   * Setter for the last term written.
   * @param _lastTermWritten String with the last term written.
   */
  public void setLastTermWritten(String _lastTermWritten) {
    this.lastTermWritten = _lastTermWritten;
  }
 
  /**
   * Begins the merge, initilialising the structures.
   * Notice that the file names must be in order of run-id 
   * @param size number of runs in disk.
   * @param fileName String with the file name of the final inverted file.
   * @throws IOException if an I/O error occurs.
   */
  protected void init(int size, String fileName) throws Exception{
    this.init(size, new BitOutputStream(fileName));
  }
 
  protected void init(int size, BitOut invertedFile) throws Exception{
    queue = new PriorityQueue<RunIterator>(size, new PostingComparator());
    bos = invertedFile;
    for(int i = 0; i < size; i++){ 
      RunIterator run = runsSource.createRunIterator(i);
      run.next();
      queue.add(run);
    }
  }
 
  /**
   * Begins the multiway merging phase.
   * @param size number of runs to be merged.
   * @param fileName output filename.
   * @throws Exception if an I/O error occurs.
   */
  public void beginMerge(int size, String fileName) throws Exception{   
    init(size, fileName);
    myRun = queue.poll();
    while(myRun.current().getTerm().equals(" ")) myRun = queue.poll();   
    lastDocument = myRun.current().append(bos,-1);
    termStatistics = myRun.current().getLexiconEntry();
    lastFreq = myRun.current().getTF();
    lastDocFreq = myRun.current().getDf()
    lastTermWritten = myRun.current().getTerm();
    if(myRun.hasNext()){
      myRun.next();     
      queue.add(myRun);     
    }else{
      myRun.close();
    }   
  }
 
  /**
   * Mergers one term in the runs. If a run is exhausted, it is closed and removed from the queue.
   * @param lexStream LexiconOutputStream used to write the lexicon.
   * @throws Exception if an I/O error occurs.
   */
  public void mergeOne(LexiconOutputStream<String> lexStream) throws Exception{   
    myRun = queue.poll();
    if(myRun.current().getTerm().equals(lastTermWritten)){
      // append the term --> keep the data in memory
      lastDocument = myRun.current().append(bos, lastDocument);
      myRun.current().addToLexiconEntry(termStatistics);
      lastFreq += myRun.current().getTF();
      lastDocFreq += myRun.current().getDf();
     
    }else{     
      //write this term to the lexicon
      termStatistics.setTermId(currentTerm++);
      ((BasicLexiconEntry)termStatistics).setOffset(startOffset);
      lexStream.writeNextEntry(lastTermWritten, termStatistics);
      //record the start offset of the next term
      startOffset.setOffset(this.getByteOffset(), this.getBitOffset());
      //get the information of the next term from the Run
      numberOfPointers += lastDocFreq;
      lastDocument = myRun.current().append(bos,-1);
      termStatistics = myRun.current().getLexiconEntry();
      lastFreq = myRun.current().getTF();
      lastDocFreq = myRun.current().getDf();
      lastTermWritten = myRun.current().getTerm();
     
    }
    if(myRun.hasNext()){
      myRun.next();
      queue.add(myRun);
    }else{
      myRun.close();
    }
  }
 
  /**
   * Ends the merging phase, writes the last entry and closes the streams.
   * @param lexStream LexiconOutputStream used to write the lexicon.
   * @throws IOException if an I/O error occurs. 
   */ 
  public void endMerge(LexiconOutputStream<String> lexStream) throws IOException{
    termStatistics.setTermId(currentTerm++);
    ((BasicLexiconEntry)termStatistics).setOffset(startOffset);
    lexStream.writeNextEntry(lastTermWritten, termStatistics);
   
    //lexStream.writeNextEntry(lastTermWritten, new BasicLexiconEntry(currentTerm++, lastDocFreq, lastFreq, startOffset));
    //startOffset.setPosition(this.getByteOffset(), this.getBitOffset());
    numberOfPointers += lastDocFreq;
    bos.close();
    myRun.close();
 

  /**
   * getBos
   * @return BitOut
   */
  public BitOut getBos() {
    return bos;
  }

  /**
   * setBos
   * @param _bos
   */
  public void setBos(BitOut _bos) {
    this.bos = _bos;
  }
}
TOP

Related Classes of org.terrier.structures.indexing.singlepass.RunsMerger$PostingComparator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.